Loading necessary libraries

library(dplyr) 
library(ggplot2)
library(knitr)
library(broom)
library(tidyr)
library(progress)
library(pbapply)
pboptions(type='none')
library(dbscan)
library(gridExtra)
library(roahd)

Introduction to Conformal Prediction

The scope of this practice session is to make you familiar with some core and advanced topics in Conformal Prediction. By resorting to synthetic data, we aim to analyze different experimental settings and see how and why conformal inference is a key tool to tackle uncertainty quantification in prediction problems.

Validity

Let us first focus on validity. In this section, we see how a well-known parametric interval fails in producing valid prediction intervals when the observations do not meet the distributional assumptions, while distribution-free conformal inference guarantees distribution-free validity.

Sample from a gaussian distribution

set.seed(2024)
n <- 100
grid_factor <- 1.25
alpha <- .1

y <- rnorm(n,mean=0, sd=1)
hist(y, col='lightblue')

Fisher’s prediction interval

When \(X_1,..., X_n\) and \(X_{n+1}\) are sampled iid from a gaussian distribution \(N(\mu, \sigma^2)\), with \(\mu\) and \(\sigma^2\) unknown, then we know that

\[\frac{X_{n+1} - \bar{X}}{s \sqrt{1+\frac{1}{n}}} \sim t(n-1)\] Then, the prediction interval of level \(1-\alpha\) for \(X_{n+1}\) is the Fisher’s prediction interval and has the form

\[C_{1-\alpha}\left( \bar{X} - s \sqrt{1+\frac{1}{n}} t_{\alpha/2}(n-1), \bar{X} + s \sqrt{1+\frac{1}{n}} t_{\alpha/2}(n-1)\right)\] The prediction interval is exact in the sense that

\[\mathbb{P} \left(X_{n+1} \in C_{1-\alpha} \right) = 1 - \alpha\] We can compute by hand the Fisher’s prediction interval

wrapper_param <- function(grid_point, y){
  n  <- length(y)
  t0 <- (grid_point - mean(y))/sd(y)/sqrt(1+1/n)
  2*(1-pt(abs(t0), n-1))
}

# Create a grid of test points
n_grid <- 200
test_grid <- seq(-grid_factor*max(abs(y)),+grid_factor*max(abs(y)),length.out=n_grid)
pval_fun <- sapply(test_grid, wrapper_param, y)
plot(test_grid,pval_fun,type='l')

The prediction interval is found as the set of points corresponding to p-value greater than \(\alpha\).

index_in <- pval_fun>alpha
PI.param <- range(test_grid[index_in])

Conformal prediction set

By resorting to conformal inference in the full conformal setting, we now construct the distribution-free counterpart of the Fisher’s prediction interval (indeed, we are considering the same non-conformity measure as before!).

wrapper_full <- function(grid_point, y){
  aug_y <- c(grid_point,y)
  ncm <- numeric(length(aug_y))
  for (i in 1:length(aug_y)) {
    ncm[i] <- abs(aug_y[i] - mean(aug_y[-i]))/sd(aug_y[-i])/sqrt(1+1/length(aug_y[-i]))
  }
  sum(ncm>=ncm[1])/(length(aug_y))
}

# Create a grid of test points
n_grid <- 200
test_grid <- seq(-grid_factor*max(abs(y)),+grid_factor*max(abs(y)),length.out=n_grid)
pval_fun <- sapply(test_grid, wrapper_full, y)
plot(test_grid,pval_fun,type='l')

Again, the prediction interval is found as the set of points corresponding to p-value greater than \(\alpha\).

index_in <- pval_fun>alpha
PI <- range(test_grid[index_in])

# Plot the prediction intervals
plot(test_grid,pval_fun,type='l')
abline(v=PI,col='blue') # Conformal prediction interval
abline(v=PI.param, col='darkorange') # Fisher's prediction interval
legend("topright", legend=c("Conformal", "Fisher's"), col=c("blue", "darkorange"),
       lty=1, cex=0.8)

hist(y, col='lightblue')
abline(v=PI,col='blue') # Conformal prediction interval
abline(v=PI.param, col='darkorange') # Fisher's prediction interval
legend("topright", legend=c("Conformal", "Fisher's"), col=c("blue", "darkorange"),
       lty=1, cex=0.8)

Are the prediction sets valid?

We test it via repeated experiments.

alpha <- .1
n <- 100
n.test <- 100

n.rep <- 100
param.coverage <- numeric(n.rep)
conf.coverage <- numeric(n.rep)

pb <- progress::progress_bar$new(
  format=" MC simulation [:bar] :percent in :elapsed",
  total=n.rep, clear=FALSE, width=60)
for(i in 1:n.rep){
  set.seed(2024+i)

  y <- rnorm(n, mean=0, sd=1)
  y.test <- rnorm(n.test, mean=0, sd=1)

  # Fisher's prediction interval
  pval_fun <- pbsapply(y.test,wrapper_param,y)
  param.coverage[i] <- mean(pval_fun>alpha)
  
  # Conformal prediction interval
  pval_fun <- pbsapply(y.test,wrapper_full,y)
  conf.coverage[i] <- mean(pval_fun>alpha)
  
  pb$tick()
}

# Display results
coverage <- c(conf.coverage, param.coverage)
method <- rep(c("Conformal","Fisher's"), each=n.rep)
data <- data.frame(method, coverage)

pgplot <- ggplot(data, aes(x=method, y=coverage, fill=method)) + 
  geom_boxplot() +
  scale_y_continuous(limits=c(0.60,1.00)) +
  theme_bw() + 
  labs(x="", y="Marginal coverage", fill="Method")
pgplot

Sample from a heavy-tailed distribution

Let us repeat the same procedure when the data are not sampled from a gaussian distribution. For instance, we sample from a t-student distribution.

set.seed(2024)
n <- 100
y <- rt(n, df=2)
hist(y, col='lightblue')

Fisher’s prediction interval

n_grid <- 200
test_grid <- seq(-grid_factor*max(abs(y)),+grid_factor*max(abs(y)),length.out=n_grid)
pval_fun <- sapply(test_grid,wrapper_param,y)
plot(test_grid,pval_fun,type='l')

index_in <- pval_fun>alpha
PI.param <- range(test_grid[index_in])

Conformal prediction interval

n_grid <- 200
test_grid <- seq(-grid_factor*max(abs(y)),+grid_factor*max(abs(y)),length.out=n_grid)
pval_fun <- sapply(test_grid,wrapper_full,y)
plot(test_grid,pval_fun,type='l')

index_in <- pval_fun>alpha
PI <- range(test_grid[index_in])

# Plot the prediction intervals
plot(test_grid,pval_fun,type='l')
abline(v=PI,col='blue') # Conformal prediction interval
abline(v=PI.param, col='darkorange') # Fisher's prediction interval
legend("topright", legend=c("Conformal", "Fisher's"), col=c("blue", "darkorange"),
       lty=1, cex=0.8)

hist(y, col='lightblue')
abline(v=PI,col='blue') # Conformal prediction interval
abline(v=PI.param, col='darkorange') # Fisher's prediction interval
legend("topright", legend=c("Conformal", "Fisher's"), col=c("blue", "darkorange"),
       lty=1, cex=0.8)

What about validity now?

alpha <- .1
n <- 100
n.test <- 100
n_grid <- 200
grid_factor <- 3

n.rep <- 100
param.coverage <- numeric(n.rep)
conf.coverage <- numeric(n.rep)

pb <- progress::progress_bar$new(
  format=" MC simulation [:bar] :percent in :elapsed",
  total=n.rep, clear=FALSE, width=60)
for(i in 1:n.rep){
  
  set.seed(2024+i)

  y <- rt(n, df=2)
  y.test <- rt(n.test, df=2)

  # Fisher's prediction interval
  pval_fun <- pbsapply(y.test,wrapper_param,y)
  param.coverage[i] <- mean(pval_fun>alpha)
  
  # Conformal prediction interval
  pval_fun <- pbsapply(y.test,wrapper_full,y)
  conf.coverage[i] <- mean(pval_fun>alpha)
  
  pb$tick()
}

# Display results
coverage <- c(conf.coverage, param.coverage)
method <- rep(c("Conformal","Fisher's"), each=n.rep)
data <- data.frame(method, coverage)

pgplot <- ggplot(data, aes(x=method, y=coverage, fill=method)) + 
  geom_boxplot() +
  scale_y_continuous(limits=c(0.60,1.00)) +
  theme_bw() + 
  labs(x="", y="Marginal coverage", fill="Method")
pgplot

Efficiency

The conformal prediction machinery guarantees validity of the resulting prediction set. The choice of ad-hoc non-conformity measures, on the other hand, may improve the efficiency of the prediction regions (namely, reduce their width).

Let us sample the data from a bimodal distribution, and repeat the construction of the conformal prediction set when employing as non-conformity measures:

  • the absolute value of the difference from the mean,

  • the average eucledian distance from the k-nearest neighbors.

We aim to assess how the amplitude of the prediction set changes when adopting one or the other non-conformity measure.

n <- 100
grid_factor <- 1.25
alpha <- .1

set.seed(2024)
y <- c(rnorm(n/2,mean=-2.6),rnorm(n/2,mean=2.6))
hist(y, col='lightblue')

Absolute value of the difference from the mean

n_grid <- 1000
test_grid <- seq(-grid_factor*max(abs(y)),+grid_factor*max(abs(y)),length.out=n_grid)
pval_fun <- sapply(test_grid,wrapper_full,y)
plot(test_grid,pval_fun,type='l')

and compute the prediction interval, as before

index_in <- pval_fun>alpha
PI.abs <- range(test_grid[index_in])

Average eucledian distance from the K-nn

wrapper_knn <- function(grid_point,y){
  k_s <- 0.25
  aug_y <- c(grid_point,y)
  ncm <- kNNdist(matrix(aug_y),k_s*length(y))
  sum((ncm[-1]>=ncm[1]))/length(aug_y)
}

pval_fun <- pbsapply(test_grid,wrapper_knn,y)
plot(test_grid,pval_fun,type='l')

Now the p-value function is not unimodal, so we cannot simply take the range of the test points corresponding to a p-value larger than \(\alpha\). The generalization is rather straightforward though.

index_in <- pval_fun>alpha
PI.knn <- test_grid[as.logical(c(0,abs(diff(index_in))))]

Visualize now the resulting prediction intervals:

# Plot the prediction intervals
hist(y, col='lightblue')
abline(v=PI.abs,col='blue') # Conformal PI with "classical" ncm
abline(v=PI.knn, col='darkred') # Conformal PI with k-nn ncm
legend("topright", legend=c("PI abs", "PI k-nn"), col=c("blue", "darkred"),
       lty=1, cex=0.8)

Again, one might repeat the experiment to check that the conformal PI with the k-nn mean eucledian distance is consistently more efficient than the competitor, though both being valid.

alpha <- .1
n <- 100
n.test <- 100
grid_factor <- 1.25
n_grid <- 200

n.rep <- 100
abs.coverage <- numeric(n.rep)
knn.coverage <- numeric(n.rep)
abs.size <- numeric(n.rep)
knn.size <- numeric(n.rep)

set.seed(2024)
pb <- progress::progress_bar$new(
  format=" MC simulation [:bar] :percent in :elapsed",
  total=n.rep, clear=FALSE, width=60)
for(i in 1:n.rep){
  
  y <- c(rnorm(n/2,mean=-2.6),rnorm(n/2,mean=2.6))
  y.grid <- seq(-grid_factor*max(abs(y)),+grid_factor*max(abs(y)),length.out=n_grid)

  y.test <- c(rnorm(n.test/2,mean=-2.6),rnorm(n.test/2,mean=2.6))
  y.test <- sort(y.test)
  
  ## Absolute difference from the mean
  pval_fun <- pbsapply(y.grid,wrapper_full,y)
  index_in <- pval_fun>alpha
  PI.abs <- range(y.grid[index_in])
  
  # Efficiency
  abs.size[i] <- PI.abs[2] - PI.abs[1]
  
  # Validity
  abs.coverage[i] <- mean((y.test >= PI.abs[1])*(y.test <= PI.abs[2]))
  
  ## K-nn
  pval_fun <- pbsapply(y.grid,wrapper_knn,y)
  index_in <- pval_fun>alpha
  PI.knn <- y.grid[as.logical(c(index_in[1],abs(diff(index_in)),tail(index_in,n=1)))]
  PI.knn_matrix <- matrix(PI.knn, ncol = 2, byrow = TRUE)
  
  # Efficiency
  knn.size[i] <- sum(PI.knn_matrix[,2]-PI.knn_matrix[,1])
  
  # Validity
  check_in <- rep(FALSE, n.test)
  for(j in 1:nrow(PI.knn_matrix)){
    check_in <- check_in + (y.test >= PI.knn_matrix[j,1])*(y.test <= PI.knn_matrix[j,2])
  }
  knn.coverage[i] <- mean(check_in)
  
  pb$tick()
}

# Display results
coverage <- c(abs.coverage, knn.coverage)
method <- rep(c("abs","k-nn"), each=n.rep)
data.coverage <- data.frame(method, coverage)

size <- c(abs.size, knn.size)
method <- rep(c("abs","k-nn"), each=n.rep)
data.size <- data.frame(method, size)

p1 <- ggplot(data.coverage, aes(x=method, y=coverage, fill=method)) + 
  geom_boxplot() +
  scale_y_continuous(limits=c(0.60,1.00)) +
  theme_bw() + 
  labs(x="", y="Marginal coverage", fill="Method")

p2 <- ggplot(data.size, aes(x=method, y=size, fill=method)) + 
  geom_boxplot() +
  scale_y_continuous(limits=c(5,9)) +
  theme_bw() + 
  labs(x="", y="Average size", fill="Method")

grid.arrange(p1, p2, ncol = 2)

Multivariate Conformal Prediction

The theory behind Conformal Prediction works for objects belonging to metric spaces, thus having a lot of generality.

Let us generate a sample from a bivariate T-student distribution.

set.seed(2024+1)
n=40
y_biv=cbind(rt(n,2), rt(n,2))
plot(y_biv[,1],y_biv[,2])

Full Conformal

Once the non-conformity measure is properly defined, the only difference with respect to the univariate case is that the p-value function is evaluated on a plane, rather than on a line.

n_grid=200
test_grid_x=seq(-grid_factor*max(abs(y_biv[,1])),+grid_factor*max(abs(y_biv[,1])),length.out = n_grid)
test_grid_y=seq(-grid_factor*max(abs(y_biv[,2])),+grid_factor*max(abs(y_biv[,2])),length.out = n_grid)
xy_surface=expand.grid(test_grid_x,test_grid_y)

wrapper_multivariate_full <- function(grid_point, y_biv){
  aug_y_biv <- rbind(grid_point,y_biv)
  ncm <- numeric(dim(aug_y_biv)[1])
  for (i in 1:length(ncm)) {
    ncm[i] <- sum((aug_y_biv[i,]-colMeans(aug_y_biv[-i,]))^2) #Squared euclidean distance from the mean
    #ncm[i] <- mahalanobis(aug_y_biv[i,], colMeans(aug_y_biv[-i,]), cov=cov(aug_y_biv[-i,])) # Mahalanobis distance
  }
  sum(ncm>=ncm[1])/(length(ncm))
}

pval_surf=pbapply(xy_surface, 1, wrapper_multivariate_full, y_biv)

Plot the p-value surface

data_plot=cbind(pval_surf,xy_surface)

ggplot() + 
scale_color_continuous()+
geom_tile(data=data_plot, aes(Var1, Var2, fill= pval_surf)) +
geom_point(data=data.frame(y_biv), aes(X1,X2)) + 
  ylim(-5,5)+
  xlim(-5,5)

Plot of the bivariate prediction set

p_set=xy_surface[pval_surf>alpha,]
poly_points=p_set[chull(p_set),]

ggplot() + 
  geom_tile(data=data_plot, aes(Var1, Var2, fill= pval_surf)) +
  geom_point(data=data.frame(y_biv), aes(X1,X2)) + 
  geom_polygon(data=poly_points,aes(Var1,Var2),color='red',size=1,alpha=0.01)+
  ylim(-5,5)+
  xlim(-5,5)

Split Conformal

i1=sample(1:n,n/2)
t_set=y_biv[i1,]
c_set=y_biv[-i1,]
mu=colMeans(t_set)
ncm=sqrt(rowSums(c_set-mu)^2) # Eculidean distance from the mean
ncm_sort=c(sort(ncm),Inf)
d=ncm_sort[ceiling((n/2 + 1)*(1-alpha))]

The prediction set in this case is identified by the portion of the plane where \(d_euc(\mu,y)>d\). This becomes evident (at least visually) if we plot the bivariate prediction set.

plot(y_biv,xlim = c(-10,10),ylim = c(-10,10))

angle_grid=seq(0,(2*pi),length.out = 1000)
circle_points=data.frame(x=d*sin(angle_grid) + mu[1],y=d*cos(angle_grid) + mu[2])

Recall that, whenever you use a split conformal algorithm, there is some randomness in the resulting prediction set, which is due to how the dataset is split in training and calibration set.

plot(y_biv,xlim = c(-10,10),ylim = c(-10,10))

i1=sample(1:n,n/2)
t_set=y_biv[i1,]
c_set=y_biv[-i1,]
mu=colMeans(t_set)
ncm=sqrt(rowSums(c_set-mu)^2)
ncm_sort=c(sort(ncm),Inf)
d=ncm_sort[ceiling((n/2 + 1)*(1-alpha))]
circle_points=data.frame(x=d*sin(angle_grid) + mu[1],y=d*cos(angle_grid) + mu[2])
polygon(circle_points,border='blue')

i1=sample(1:n,n/2)
t_set=y_biv[i1,]
c_set=y_biv[-i1,]
mu=colMeans(t_set)
ncm=sqrt(rowSums(c_set-mu)^2)
ncm_sort=c(sort(ncm),Inf)
d=ncm_sort[ceiling((n/2 + 1)*(1-alpha))]
circle_points=data.frame(x=d*sin(angle_grid) + mu[1],y=d*cos(angle_grid) + mu[2])
polygon(circle_points,border='red')

i1=sample(1:n,n/2)
t_set=y_biv[i1,]
c_set=y_biv[-i1,]
mu=colMeans(t_set)
ncm=sqrt(rowSums(c_set-mu)^2)
ncm_sort=c(sort(ncm),Inf)
d=ncm_sort[ceiling((n/2 + 1)*(1-alpha))]
circle_points=data.frame(x=d*sin(angle_grid) + mu[1],y=d*cos(angle_grid) + mu[2])
polygon(circle_points,border='green')

Conformal Prediction in Regression

Let us consider now the particular case of regression, where the point predictor for a new observation \(Y_{n+1}\) of the response variable is given by the OLS estimate obtained on the training data, applied to the observed predictors \(X_{n+1}\). We compare the prediction intervals obtained under parametric assumptions with those obtained via conformal inference in the split conformal fashion.

The key idea in the split conformal framework is to compute a conformity score for each observation in the calibration set, measuring the discrepancy between the true value of Y and that predicted by the linear model. The model fitted on the training data is applied to hold-out calibration samples, producing a collection of conformity scores. As all data points are exchangeable, the empirical distribution of the calibration scores can be leveraged to make predictive inferences about the conformity score of a new test point. Finally, inverting the function defining the conformity scores yields a prediction set for the test \(Y_{n+1}\).

In the following, we will consider cases when the parametric assumptions are met, and cases when the parametric assumptions are violated.

To generate each scenario, we will resort the following data generation function:

sample_data <- function(model=c("homoscedastic", "heavy-tailed", "heteroscedastic"),
                        n,
                        seed)
{
  set.seed(seed)
  if(model=="homoscedastic"){
    x <- runif(n,0,1)
    epsilon <- 0.2*rnorm(n,0,1)
    y <- x + epsilon
  }
  
  if(model=="heavy-tailed"){
    x <- runif(n,0,1)
    epsilon <- 0.2*rt(n, df=2)
    y <- x + epsilon
  }
  
  if(model=="heteroscedastic"){
    x <- runif(n,0,1)
    epsilon <- 0.2 * (x^2) * rnorm(n,0,1) 
    y <- x + epsilon
  }
  
  out <- list("x" = x,
              "y" = y)
  return(out)
}

Linear regression with homoscedastic residuals

We generate data from a toy model with one explanatory variable and one response variable.

seed <- 2023
n <- 1000

data <- sample_data(model="homoscedastic", n, seed)
x <- data$x
y <- data$y

par(mfrow=c(1,2))
hist(y, breaks=10, col='lightblue', main="Histogram of y values")
plot(x, y, col='lightblue', main="Data")

# Fit linear model with ols
mod <- lm(y ~ x, data)

# Display the regression function
x.grid <- data.frame("x"=seq(0,1,length.out=1000))
y.grid.hat <- predict(mod, x.grid)
par(mfrow=c(1,1))
plot(x, y, col='lightblue', main="Data and linear regession function")
lines(x.grid$x, y.grid.hat, type='l', col='black', lwd=2)

Parametric prediction intervals

We can use linear regression theory to compute prediction intervals, and define a function to evaluate the coverage and width of the prediction set on test data.

evaluate_predictions <- function(x.test, y.test, x.grid, lower, upper, view_plot=T){
  covered <- (y.test >= lower)*(y.test<=upper)
  coverage <- mean(covered)
  width <- mean(upper-lower)
  
  if(view_plot){
    idx.sort <- sort(x.test$x, index.return=TRUE)$ix
    plot(x.test$x[idx.sort], y.test[idx.sort], col='lightblue', main=paste0("Prediction interval, alpha=",alpha, ", coverage=", round(coverage,2), ", width=", round(width,2)),
     xlab="x test", ylab="y test")
    lines(x.test$x[idx.sort], lower[idx.sort], lty=3, col='black', lwd=2)
    lines(x.test$x[idx.sort], upper[idx.sort], lty=3, col='black', lwd=2)
  }
  
  out <- list("coverage"=coverage,
              "width"=width)
  return(out)
}

n.test <- 1000
data.test <- sample_data(model="homoscedastic", n.test, seed+n)
x.test <- data.test$x
y.test <- data.test$y
x.test <- data.frame("x"=x.test)

# Nominal significance level
alpha <- .1
pi.param <- predict(mod, x.test, interval="prediction", level=1-alpha)

performance <- evaluate_predictions(x.test, y.test, lower=pi.param[,'lwr'], upper=pi.param[,'upr'])

Conformal prediction intervals

#' Compute conformal prediction interval
reg_split_conformal <- function(x, y, x.test, alpha){
  n <- length(y)
  n.train <- floor(n/2)
  n.calib <- n-n.train
  
  # Split the data into training and calibrations sets
  idxs.train <- sample(1:n, size=n.train)
  
  x.train <- x[idxs.train]
  y.train <- y[idxs.train]
  x.calib <- x[-idxs.train]
  y.calib <- y[-idxs.train]
  
  data <- data.frame("x"=x.train, "y"=y.train)
  mod <- lm(y~x, data)
  ncs.calib <- abs(y.calib - predict(mod, data.frame("x"=x.calib)))
  ncs.quantile <- quantile(ncs.calib, prob=(1-alpha)*(n.calib+1)/n.calib)
  
  # Construct the prediction interval
  x.test <- data.frame("x"=x.test)
  y.hat <- predict(mod, x.test)
  lower_bound <- y.hat - ncs.quantile
  upper_bound <- y.hat + ncs.quantile
  
  out <- list("lower_bound"=lower_bound,
              "upper_bound"=upper_bound)
  
}

# Nominal significance level
alpha <- .1
pi.conformal <- reg_split_conformal(x, y, x.test, alpha)

performance <- evaluate_predictions(x.test, y.test, lower=pi.conformal$lower_bound, upper=pi.conformal$upper_bound)

Linear regression with heavy tailed residuals

Let us now consider data generated by a linear regression model with heavy tailed residuals.

n <- 1000

data <- sample_data(model="heavy-tailed", n, seed)
x <- data$x
y <- data$y

par(mfrow=c(1,2))
hist(y, breaks=10, col='lightblue', main="Histogram of y values")
plot(x, y, col='lightblue', main="Data")

# Fit linear model with ols
mod <- lm(y ~ x, data)

# Display the regression function
x.grid <- data.frame("x"=seq(0,1,length.out=1000))
y.grid.hat <- predict(mod, x.grid)
par(mfrow=c(1,1))
plot(x, y, col='lightblue', main="Data and linear regession function")
lines(x.grid$x, y.grid.hat, type='l', col='black', lwd=2)

Again, we use linear regression theory to compute prediction intervals, and evaluate the coverage and width of the prediction set on test data.

n.test <- 1000
data.test <- sample_data(model="heavy-tailed", n.test, seed+n)
x.test <- data.test$x
y.test <- data.test$y
x.test <- data.frame("x"=x.test)

# Nominal significance level
alpha <- .1
pi.param <- predict(mod, x.test, interval="prediction", level=1-alpha)

performance.param <- evaluate_predictions(x.test, y.test, lower=pi.param[,'lwr'], upper=pi.param[,'upr'])

pi.conformal <- reg_split_conformal(x, y, x.test, alpha)

performance.conformal <- evaluate_predictions(x.test, y.test, lower=pi.conformal$lower_bound, upper=pi.conformal$upper_bound)

Linear regression with heteroscedastic residuals

We now repeat the same analysis on data generated with a linear regression model with heteroscedastic residuals.

n <- 1000

data <- sample_data(model="heteroscedastic", n, seed)
x <- data$x
y <- data$y

par(mfrow=c(1,2))
hist(y, breaks=10, col='lightblue', main="Histogram of y values")
plot(x, y, col='lightblue', main="Data")

# Fit linear model with ols
mod <- lm(y ~ x, data)

# Display the regression function
x.grid <- data.frame("x"=seq(0,1,length.out=1000))
y.grid.hat <- predict(mod, x.grid)
par(mfrow=c(1,1))
plot(x, y, col='lightblue', main="Data and linear regession function")
lines(x.grid$x, y.grid.hat, type='l', col='black', lwd=2)

Again, we use linear regression theory to compute prediction intervals, and evaluate the coverage and width of the prediction set on test data.

n.test <- 1000
data.test <- sample_data(model="heteroscedastic", n.test, seed+n)
x.test <- data.test$x
y.test <- data.test$y
x.test <- data.frame("x"=x.test)

# Nominal significance level
alpha <- .1
pi.param <- predict(mod, x.test, interval="prediction", level=1-alpha)

performance.param <- evaluate_predictions(x.test, y.test, lower=pi.param[,'lwr'], upper=pi.param[,'upr'])

pi.conformal <- reg_split_conformal(x, y, x.test, alpha)

performance.conformal <- evaluate_predictions(x.test, y.test, lower=pi.conformal$lower_bound, upper=pi.conformal$upper_bound)

RECALL: Conformal prediction guarantees marginal validity in finite samples! But an ideal prediction interval should be: * valid in finite samples, without making strong distributional assumptions * the shortest possible at each point in the covariates space, for the prediction to be informative.

Toward conditional validity

Current research is going toward solving the issue of adjusting the prediction intervals according to the local variability of the responses. In this direction, binning is one of the possible solutions toward conditional validity.

#' Compute conformal prediction interval
binning_conformal <- function(x, y, x.test, y.test, alpha, nbins, view_plot=T){
  
  x_bins <- seq(0,1,length.out=nbins+1)
  lowers <- list()
  uppers <- list()
  bins_x.test <- list()
  bins_y.test <- list()
  
  coverages <- NULL
  widths <- NULL
  
  for(j in 1:(length(x_bins)-1)){
    binj <- as.logical((x>=x_bins[j])*(x<=x_bins[j+1]))
    binj_test <- as.logical((x.test[,1]>=x_bins[j])*(x.test[,1]<=x_bins[j+1]))
    
    pi.conformal <- reg_split_conformal(x[binj], y[binj], x.test[binj_test,1], alpha)
    lowers[[j]] <- pi.conformal$lower_bound
    uppers[[j]] <- pi.conformal$upper_bound
    bins_x.test[[j]] <- x.test[binj_test,1]
    bins_y.test[[j]] <- y.test[binj_test]
  }
  
  if(view_plot){
    lower <- unlist(lowers)
    upper <- unlist(uppers)
    x.plot <- unlist(bins_x.test)
    y.plot <- unlist(bins_y.test)
    idx.sort <- sort(x.plot, index.return=TRUE)$ix
    plot(x.plot[idx.sort], y.plot[idx.sort], col='lightblue', main=paste0("Binned PI, alpha=",alpha, ", nbins=",nbins),
     xlab="x test", ylab="y test")
    lines(x.plot[idx.sort], lower[idx.sort], lty=3, col='black', lwd=2)
    lines(x.plot[idx.sort], upper[idx.sort], lty=3, col='black', lwd=2)
  }
  
}

# Nominal significance level
alpha <- .1
pi.conformal <- binning_conformal(x, y, x.test, y.test, alpha, nbins=3)

Note that having validity in each bin implies having global validity. And if we increased the number of bins

par(mfrow=c(2,3))
for(nbins in 3:8){
  pi.conformal <- binning_conformal(x, y, x.test, y.test, alpha, nbins=nbins)
}

The bins cannot be too small. On the contrary, there is a trade-off between achieving conditional validity and having enough datapoints in each bin to well-calibrate local prediction intervals.

On-going research on the topic concerns:

  • Locally adaptive split conformal methods

  • Conformalized Quantile Regression